#!pip install pandas==0.25
import pandas as pd
pd.__version__
# Import modules
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import pandas_profiling as pp
import statsmodels.api as sm
from sklearn.model_selection import KFold,cross_val_score, train_test_split, GridSearchCV, learning_curve, validation_curve, RepeatedKFold
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, ElasticNet, Lasso, Ridge, BayesianRidge, LassoLarsIC
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, RFECV, SelectKBest, f_regression
# Stats
from scipy import stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
# Figures inline and set visualization style
%matplotlib inline
sns.set()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from google.colab import files
uploaded = files.upload()
import io
df_train = pd.read_csv(io.BytesIO(uploaded['train.csv']))
print(df_train.head(n=4))
df_train.shape
n_train = df_train.shape[0]
print(n_train)
#pp.ProfileReport(df_train)
uploaded = files.upload()
df_test = pd.read_csv(io.BytesIO(uploaded['test.csv']))
print(df_test.shape)
df_test.head(n=4)
SalePrice_train = df_train.SalePrice
data = pd.concat([df_train.drop(['SalePrice'], axis=1), df_test])
df_train.info()
df_train.SalePrice.describe()
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [8, 5]})
sns.histplot(df_train['SalePrice'],kde=True)
plt.title("Histogram for SalePrice")
# Skew and kurt
print("Skewness: %f" % df_train['SalePrice'].skew())
print("Kurtosis: %f" % df_train['SalePrice'].kurt())
Figure 1.1: distribution of the dependent variable sale prices
df_train.SalePrice.plot.box()
plt.tight_layout(pad=0.5)
Figure 1.2: box plot of the dependent variable sale prices
stats.probplot(df_train.SalePrice, plot=sns.mpl.pyplot)
Figure 1.3: Q-Q plot of the dependent variable sale prices
df_train.describe()
data_percent_missing = data.isnull().sum() * 100 / len(data)
data_missing_value = pd.DataFrame({'column_name': data.columns,'percent_missing': data_percent_missing})
data_missing_value = data_missing_value.sort_values('percent_missing',ascending=False)
data_missing_value['dtypes'] = data.dtypes
data_missing_value = data_missing_value[data_missing_value.percent_missing>0]
print(data_missing_value)
data_missing_object = data_missing_value[data_missing_value['dtypes']=='object']['column_name'].tolist()
data_missing_num = data_missing_value[data_missing_value['dtypes']!='object']['column_name'].tolist()
print(data_missing_object)
print(data_missing_num)
Figure 4.2: imputation for missing data
for col in data_missing_object:
data[col] = data[col].fillna('_NA_');
for col in data_missing_num:
data[col] = data[col].fillna(0);
#for col in ['GarageYrBlt']:
# data[col] = data.groupby('Neighborhood')[col].transform(lambda x: x.fillna(x.median()))
data_percent_missing = data.isnull().sum() * 100 / len(data)
data_missing_value = pd.DataFrame({'column_name': data.columns,'percent_missing': data_percent_missing})
data_missing_value = data_missing_value.sort_values('percent_missing',ascending=False)
data_missing_value['dtypes'] = data.dtypes
data_missing_value[data_missing_value.percent_missing>0]
data.head()
Figure 4.1: feature creation
data['TotalSF'] = data['1stFlrSF'] + data['2ndFlrSF'] + data['TotalBsmtSF']
data['TotalPorchSF'] = data['OpenPorchSF']+data['EnclosedPorch']+data['3SsnPorch']+data['ScreenPorch']+data['WoodDeckSF']
data['HouseAge'] = data.YrSold - data.YearBuilt
data['QualityIndex'] = data.OverallQual * data.OverallCond
data['Total_Bathrooms'] = data.BsmtFullBath + .5*data.BsmtHalfBath + data.FullBath + .5*data.HalfBath
data['Has_Fireplaces'] = np.where(data['Fireplaces']>=1, 1, 0)
data['Has_Bsmt'] = np.where(data['TotalBsmtSF']>=0, 1, 0)
data['Has_Garage'] = np.where(data['GarageArea']>=0, 1, 0)
data['Has_Pool'] = np.where(data['PoolArea']>=0, 1, 0)
data['Has_2ndStory'] = np.where(data['2ndFlrSF']>=0, 1, 0)
data.head()
Figure 2.1: list of columns of the percentage of missing values
df_train_num = df_train.select_dtypes(include=np.number)
df_train_num_predictors = df_train_num.drop(['SalePrice'], axis=1)
print(df_train_num_predictors.shape)
df_train_num_predictors.hist(bins=10, figsize=(20, 25), layout=(8, 5));
# visualising some more outliers in the data values
fig, axs = plt.subplots(ncols=2, nrows=0, figsize=(12, 150))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
sns.color_palette("husl", 8)
for i, feature in enumerate(list(df_train_num_predictors), 1):
if(feature=='MiscVal'):
break
plt.subplot(len(list(df_train_num_predictors)), 3, i)
sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', palette='Blues', data=df_train)
plt.xlabel('{}'.format(feature), size=15,labelpad=12.5)
plt.ylabel('SalePrice', size=15, labelpad=12.5)
for j in range(2):
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=12)
plt.legend(loc='best', prop={'size': 10})
plt.show()
Figure 3.1: scatter plots of continuous variables versus the sale price
# Outliers
print(df_train[(df_train.GrLivArea>4500) & (df_train.SalePrice<200000)])
print(df_train[(df_train['1stFlrSF']>4500) & (df_train.SalePrice<200000)])
print(df_train[(df_train.LotFrontage>300)])
Figure 2.2: list of outliers
df_train_categorical = df_train.select_dtypes(exclude=np.number)
print("Categorical:", df_train_categorical.shape)
df_train_num = df_train.select_dtypes(include=np.number)
df_train_num = df_train_num.drop(['Id'], axis=1)
print("Numerical:", df_train_num.shape)
plt.subplots(figsize=(38, 38))
sns.heatmap(df_train_num.corr(), annot = True, vmin=-1, vmax=1, center= 0, cmap= 'coolwarm', fmt='.1g')
Figure 3.2: correlation matrix
# visualising some more outliers in the data values
fig, ax = plt.subplots(15, 3, figsize=(20, 100))
for var, subplot in zip(df_train_categorical, ax.flatten()):
sns.countplot(x=var, data=df_train, ax=subplot)
Figure 3.3: bar plot of categorical variables
fig, ax = plt.subplots(15, 3, figsize=(20, 100))
for var, subplot in zip(df_train_categorical, ax.flatten()):
sns.boxplot(x=var, y='SalePrice', data=df_train, ax=subplot)
Figure 3.4: box plot of categorical variables
fig = plt.figure(figsize = (25,60))
sns.countplot(x='Neighborhood', data=df_train, ax=fig.add_subplot(6,1,1));
sns.boxplot(x='Neighborhood', y='SalePrice', data=df_train, ax=fig.add_subplot(6,1,2));
sns.countplot(x='Exterior1st', data=df_train, ax=fig.add_subplot(6,1,3));
sns.boxplot(x='Exterior1st', y='SalePrice', data=df_train, ax=fig.add_subplot(6,1,4));
sns.countplot(x='Exterior2nd', data=df_train, ax=fig.add_subplot(6,1,5));
sns.boxplot(x='Exterior2nd', y='SalePrice', data=df_train, ax=fig.add_subplot(6,1,6));
print(list(data.columns))
Figure 4.3: encode categorical variables
#data = pd.get_dummies(data, columns=list(df_train_categorical.columns), drop_first=True)
data.head()
# Fetch all numeric features
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for i in data.columns:
if data[i].dtype in numeric_dtypes:
numeric.append(i)
# Create box plots for all numeric features
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=data[numeric] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)
# Find skewed numerical features
skew_features = data[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index
print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features.head(10)
Figure 6.5: Box Cox to skewed features
# Normalize skewed features
for i in skew_index:
print(i)
data[i] = data[i]+1
data[i] = boxcox1p(data[i], boxcox_normmax(data[i]+1))
# Let's make sure we handled all the skewed values
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=data[skew_index] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)
def logs(res, ls):
m = res.shape[1]
for l in ls:
res = res.assign(newcol=pd.Series(np.log(1.01+res[l])).values)
res.columns.values[m] = l + '_log'
m += 1
return res
log_features = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF',
'TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
'TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF',
'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','YearRemodAdd','TotalSF','HouseAge']
data = logs(data, log_features)
Figure 6.6: Log transformation
def squares(res, ls):
m = res.shape[1]
for l in ls:
res = res.assign(newcol=pd.Series(res[l]*res[l]).values)
res.columns.values[m] = l + '_sq'
m += 1
return res
squared_features = ['YearRemodAdd', 'LotFrontage_log',
'TotalBsmtSF_log', '1stFlrSF_log', '2ndFlrSF_log', 'GrLivArea_log',
'GarageCars_log', 'GarageArea_log','TotalSF_log','HouseAge_log']
data = squares(data, squared_features)
Figure 6.7: Square of Log transformation
data= pd.get_dummies(data).reset_index(drop=True)
print(data.shape)
data = data.loc[:,~data.columns.duplicated()]
print(data.shape)
Figure 6.8: Converts categorical data into dummy or indicator variables
df_train = data.iloc[:n_train]
df_test = data.iloc[n_train:]
print(df_train.shape)
print(df_test.shape)
df_train['SalePrice'] = SalePrice_train
df_train = df_train[~df_train.Id.isin([1299,524,935])]
print(df_train.shape)
df_train.head()
Figure 5.1: perform both min-max and standard scaling on the dependent variable
# log(1+x) transform
df_train["Log1p_SalePrice"] = np.log1p(df_train['SalePrice'] )
# define standard scaler
scaler = StandardScaler()
df_train["StandardScal_SalePrice"] = scaler.fit_transform(df_train[['SalePrice']] )
# define max-min scaler
scaler = MinMaxScaler()
df_train["MaxMinScal_SalePrice"] = scaler.fit_transform(df_train[['SalePrice']] )
df_train.head()
df_train.dtypes
# Finding numeric features
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for i in df_train.columns:
if df_train[i].dtype in numeric_dtypes:
numeric.append(i)
# visualising some more outliers in the data values
fig, axs = plt.subplots(ncols=2, nrows=0, figsize=(12, 200))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
sns.color_palette("husl", 8)
for i, feature in enumerate(list(df_train[numeric]), 1):
if(feature=='MiscVal'):
break
plt.subplot(len(list(numeric)), 3, i)
sns.scatterplot(x=feature, y='Log1p_SalePrice', hue='Log1p_SalePrice', palette='Blues', data=df_train)
plt.xlabel('{}'.format(feature), size=15,labelpad=12.5)
plt.ylabel('Log1p_SalePrice', size=15, labelpad=12.5)
for j in range(2):
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=12)
plt.legend(loc='best', prop={'size': 10})
plt.show()
Figure 7.1: scatter plots of continuous variables versus Log1p(sale price)
Figure 7.1 Junk Model with only one variable OverallQual
# junk model with OverallQual only
from google.colab import files
saleprice_overallqual = df_train.groupby(['OverallQual'])['SalePrice'].mean()
print(saleprice_overallqual)
df_test_junk = pd.merge(df_test[['Id','OverallQual']],saleprice_overallqual, how='left', on='OverallQual')
df_test_junk.describe()
df_test_junk[['Id', 'SalePrice']].to_csv('OverallQual_SalePrice.csv', index=False)
#files.download('OverallQual_SalePrice.csv')
Figure 6.9 Average SalePrice by OverallQual
# split df_train dataset
X_train, X_test, y_train, y_test = train_test_split(df_train[df_train.columns.drop(list(df_train.filter(regex='SalePrice')))], df_train[['Log1p_SalePrice']], test_size=0.2, random_state=321)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
X_train.head()
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [8, 5]})
sns.histplot(df_train['Log1p_SalePrice'],kde=True)
plt.title("Histogram for Log1p_SalePrice")
# Skew and kurt
print("Skewness: %f" % df_train['Log1p_SalePrice'].skew())
print("Kurtosis: %f" % df_train['Log1p_SalePrice'].kurt())
Figure 6.3: Histogram of Log1p_SalePrice
# score junk model
x_train_junk = pd.merge(X_train[['Id','OverallQual']],saleprice_overallqual, how='left', on='OverallQual')
print(np.sqrt(mean_squared_error(y_train['Log1p_SalePrice'], np.log1p(x_train_junk['SalePrice'])))) #0.22977945196526697
x_test_junk = pd.merge(X_test[['Id','OverallQual']],saleprice_overallqual, how='left', on='OverallQual')
print(np.sqrt(mean_squared_error(y_test['Log1p_SalePrice'], np.log1p(x_test_junk['SalePrice'])))) #0.21681551289920958
#print(y_train['Log1p_SalePrice'].head())
residuals = y_test['Log1p_SalePrice']- np.log1p(x_test_junk['SalePrice'])
mean_residuals = np.mean(residuals)
print(mean_residuals)
print("R squared: {}".format(r2_score(y_true=y_test['Log1p_SalePrice'],y_pred=np.log1p(x_test_junk['SalePrice']))))
# Detecting heteroscedasticity
fig = plt.figure(figsize = (20,20))
y_pred = np.log1p(x_test_junk['SalePrice'])
p1 = sns.scatterplot(y_pred,residuals,ax=fig.add_subplot(2,2,1))
plt.xlabel('predicted values')
plt.ylabel('Residuals')
p1 = plt.title('Residuals vs fitted values plot for homoscedasticity check')
p2 = sns.distplot(residuals,kde=True,ax=fig.add_subplot(2,2,2))
p2 = plt.title('Normality of error terms/residuals')
Figure 7.2 Heteroscedasticity and normality of residuals
Figure 9.1: Junk model in Kaggle
Figure 6.1 split train and test on the training dataset
list(df_train.filter(regex='SalePrice'))+['Id']
Figure 6.2: cross validation
# Setup cross validation folds
kf = KFold(n_splits=10, random_state=321, shuffle=True)
# Define error metrics
def k_rmse(y, y_pred):
return np.sqrt(mean_squared_error(y, y_pred))
def cv_rmse(model, X_train, y_train):
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=kf))
return (rmse)
scores = {}
# LinearRegression
names=pd.DataFrame(X_train.columns)
linear = LinearRegression().fit(X_train, y_train)
score = cv_rmse(linear,X_train, y_train)
print("linear: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['linear'] = (score.mean(), score.std())
print(pd.DataFrame(linear.coef_.transpose()).shape)
print(names.shape)
#X_train.columns.shape
coeff_df = pd.concat([pd.DataFrame(linear.coef_.transpose(),columns=['coeff']), pd.DataFrame(X_train.columns,columns=['colname'])], axis=1)
coeff_df.head(10)
## creating function to get model statistics
def get_stats():
results = sm.OLS(y_train, X_train).fit()
print(results.summary())
get_stats()
# Make predictions using the testing set
x_test_linear = pd.DataFrame()
print(pd.DataFrame(linear.predict(X_test),columns=['Log1p_SalePrice']).head())
x_test_linear = pd.DataFrame(linear.predict(X_test),columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_linear['Log1p_SalePrice']))) #0.2206744703562279
#use linear regression as the model
lr = LinearRegression()
rfe_mod = RFECV(lr, step=1, cv=10)
linear2=rfe_mod.fit(X_train, y_train) #to fit
# Make predictions using the testing set
x_test_linear2 = pd.DataFrame()
linear_test2 = linear2.predict(X_test)
x_test_linear2['Log1p_SalePrice'] = linear_test2
print(np.sqrt(mean_squared_error(y_test, x_test_linear2['Log1p_SalePrice']))) #0.18839622722363722
# check out which features were selected
rfe_mod.support_
var_important = pd.DataFrame(rfe_mod.support_,index=X_train.columns,columns=['Rank'])
print(var_important[var_important['Rank']==True])
var_imp = var_important[var_important['Rank']==True].index.tolist()
print(var_imp)
linear2 = LinearRegression().fit(X_train[var_imp], y_train)
score = cv_rmse(linear,X_train[var_imp], y_train)
print("linear: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['linear'] = (score.mean(), score.std())
## creating function to get model statistics
def get_stats():
results = sm.OLS(y_train, X_train[var_imp]).fit()
print(results.summary())
get_stats()
list(X_train.columns)
Figure 7.3: Handpicked Linear Model
select_vars0 = ['TotRmsAbvGrd','TotalSF','GrLivArea','GarageCars']
select_vars1 = ['LotArea_log','TotalSF_log','GarageCars_log','HouseAge_log']
select_vars2 = ['TotalSF_log_sq','HouseAge_log_sq']
oth_var = ['KitchenAbvGr','Total_Bathrooms','PoolArea', 'QualityIndex','Fireplaces']
oth_ind = ['Has_Bsmt', 'Has_Garage', 'Has_Pool', 'Has_2ndStory',
'Neighborhood_BrkSide','Neighborhood_Crawfor', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt','Neighborhood_StoneBr',
'Id']
features = select_vars1+select_vars2+oth_var+oth_ind
# LinearRegression
X_train0 = X_train[features]
plt.figure(figsize=(20,20)) # on this line I just set the size of figure to 12 by 10.
p=sns.heatmap(X_train[select_vars1+oth_var].corr(), annot=True,cmap='RdYlGn',square=True)
linear3 = LinearRegression().fit(X_train0, y_train)
score = cv_rmse(linear,X_train0, y_train)
print("linear: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['linear'] = (score.mean(), score.std())
# Make predictions using the testing set
x_test_linear = pd.DataFrame(linear3.predict(X_test[features]),columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_linear['Log1p_SalePrice']))) #0.11475766913083842
Figure 7.4: Multicollinearity
## creating function to get model statistics
def get_stats():
results = sm.OLS(y_train, X_train0).fit()
print(results.summary())
get_stats()
y_pred = x_test_linear['Log1p_SalePrice']
residuals = y_test['Log1p_SalePrice']- y_pred
mean_residuals = np.mean(residuals)
print(mean_residuals)
print("R squared: {}".format(r2_score(y_true=y_test['Log1p_SalePrice'],y_pred=y_pred)))
# Detecting heteroscedasticity
fig = plt.figure(figsize = (20,20))
p1 = sns.scatterplot(y_pred,residuals,ax=fig.add_subplot(2,2,1))
plt.xlabel('predicted values')
plt.ylabel('Residuals')
p1 = plt.title('Residuals vs fitted values plot for homoscedasticity check')
p2 = sns.distplot(residuals,kde=True,ax=fig.add_subplot(2,2,2))
p2 = plt.title('Normality of error terms/residuals')
df_test_linear = pd.DataFrame()
print(df_test.Id.head())
df_test_linear = pd.DataFrame(linear3.predict(df_test[features]),columns=['Log1p_SalePrice'])
print(df_test_linear.head())
df_test_linear['SalePrice'] = np.expm1(df_test_linear['Log1p_SalePrice'])
df_test_linear['Id'] = df_test['Id'].values
df_test_linear[['Id', 'SalePrice']].to_csv('LinearRegression_SalePrice.csv', index=False)
print(df_test_linear.head())
#files.download('LinearRegression_SalePrice.csv') #0.14259
print(df_test_linear['SalePrice'].describe())
print(df_test_junk['SalePrice'].describe())
Figure 9.2: Handpicked Linear Model in Kaggle
Principal Component Analysis
# Standardizing the features
npca = 50
X_trainPCA = StandardScaler().fit_transform(X_train)
pca = PCA(n_components=npca)
pca_train = pca.fit_transform(X_trainPCA)
pca_trainDF = pd.DataFrame(data = pca_train, columns = ["col"+str(i) for i in range(1, npca+1)])
X_testPCA = StandardScaler().fit_transform(X_test)
pca_test = pca.transform(X_testPCA)
pca_testDF = pd.DataFrame(data = pca_test, columns = ["col"+str(i) for i in range(1, npca+1)])
pca.explained_variance_ratio_
# LinearRegression
names=pd.DataFrame(X_train.columns)
linearPCA = LinearRegression().fit(pca_trainDF, y_train)
score = cv_rmse(linearPCA,pca_trainDF, y_train)
print("linear: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['linearPCA'] = (score.mean(), score.std())
print(pca.explained_variance_ratio_)
print(pca_trainDF.head())
print(pca_testDF.head())
# Make predictions using the testing set
x_test_linear = pd.DataFrame()
print(pd.DataFrame(linearPCA.predict(pca_testDF),columns=['Log1p_SalePrice']).head())
x_test_linear = pd.DataFrame(linearPCA.predict(pca_testDF),columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_linear['Log1p_SalePrice']))) #0.1107754992636765
df_test_linearPCA = pd.DataFrame()
print(df_test.Id.head())
df_testPCA = StandardScaler().fit_transform(df_test)
pca_df_test = pca.transform(df_testPCA)
pca_df_testDF = pd.DataFrame(data = pca_df_test, columns = ["col"+str(i) for i in range(1, npca+1)])
df_test_linearPCA = pd.DataFrame(linearPCA.predict(pca_df_testDF),columns=['Log1p_SalePrice'])
print(df_test_linearPCA.head())
df_test_linearPCA['SalePrice'] = np.expm1(df_test_linearPCA['Log1p_SalePrice'])
df_test_linearPCA['Id'] = df_test['Id'].values
df_test_linearPCA[['Id', 'SalePrice']].to_csv('LinearRegressionPCA_SalePrice.csv', index=False)
print(df_test_linearPCA.head())
#files.download('LinearRegressionPCA_SalePrice.csv') #.13759
print(df_test_junk['SalePrice'].describe())
print(df_test_linear['SalePrice'].describe())
print(df_test_linearPCA['SalePrice'].describe())
Figure 8.1 Support Vector Regressor, KRR, Ridge and Lasso
# Support Vector Regressor
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))
# KRR
krr = make_pipeline(RobustScaler(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5))
# Ridge Regressor
ridge_alphas = [9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1, 0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas, cv=kf))
# Lasso Regressor
lasso_alphas = arange (0.0001, 0.002, 0.0001)
lasso = make_pipeline(RobustScaler(), LassoCV(alphas =lasso_alphas, cv=kf))
score = cv_rmse(svr,X_train, y_train)
print("svr: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['svr'] = (score.mean(), score.std())
svr =svr.fit(X_train, y_train)
x_test_svr = pd.DataFrame()
svr_test = svr.predict(X_test)
x_test_svr['Log1p_SalePrice'] = svr_test
print(np.sqrt(mean_squared_error(y_test, x_test_svr['Log1p_SalePrice'])))
residuals = y_test['Log1p_SalePrice']- x_test_svr['Log1p_SalePrice']
mean_residuals = np.mean(residuals)
print(mean_residuals)
print("R squared: {}".format(r2_score(y_true=y_test['Log1p_SalePrice'],y_pred=x_test_svr['Log1p_SalePrice'])))
# Detecting heteroscedasticity
fig = plt.figure(figsize = (20,20))
y_pred = x_test_svr['Log1p_SalePrice']
p1 = sns.scatterplot(y_pred,residuals,ax=fig.add_subplot(2,2,1))
plt.xlabel('predicted values')
plt.ylabel('Residuals')
p1 = plt.title('Residuals vs fitted values plot for homoscedasticity check')
p2 = sns.distplot(residuals,kde=True,ax=fig.add_subplot(2,2,2))
p2 = plt.title('Normality of error terms/residuals')
df_test.head()
df_test_svr = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
df_test_svr['Log1p_SalePrice'] = svr.predict(df_test_svr)
df_test_svr['SalePrice'] = np.expm1(df_test_svr['Log1p_SalePrice'])
df_test_svr[['Id', 'SalePrice']].to_csv('SVR_SalePrice.csv', index=False)
#files.download('SVR_SalePrice.csv') #0.12749
Figure 9.3: SVR in Kaggle
score = cv_rmse(krr,X_train, y_train)
print("krr: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['krr'] = (score.mean(), score.std())
krr =krr.fit(X_train, y_train)
krr_test = krr.predict(X_test)
x_test_krr = pd.DataFrame(krr_test, columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_krr['Log1p_SalePrice'])))
Figure 8.2: Ridge Model
score = cv_rmse(ridge,X_train, y_train)
print("ridge: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['ridge'] = (score.mean(), score.std())
ridge =ridge.fit(X_train, y_train)
ridge_test = ridge.predict(X_test)
x_test_ridge = pd.DataFrame(ridge_test, columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_ridge['Log1p_SalePrice'])))
residuals = y_test['Log1p_SalePrice']- x_test_ridge['Log1p_SalePrice']
mean_residuals = np.mean(residuals)
print(mean_residuals)
print("R squared: {}".format(r2_score(y_true=y_test['Log1p_SalePrice'],y_pred=x_test_ridge['Log1p_SalePrice'])))
# Detecting heteroscedasticity
fig = plt.figure(figsize = (20,20))
y_pred = x_test_ridge['Log1p_SalePrice']
p1 = sns.scatterplot(y_pred,residuals,ax=fig.add_subplot(2,2,1))
plt.xlabel('predicted values')
plt.ylabel('Residuals')
p1 = plt.title('Residuals vs fitted values plot for homoscedasticity check')
p2 = sns.distplot(residuals,kde=True,ax=fig.add_subplot(2,2,2))
p2 = plt.title('Normality of error terms/residuals')
df_test_ridge = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
df_test_ridge['Log1p_SalePrice'] = ridge.predict(df_test_ridge)
df_test_ridge['SalePrice'] = np.expm1(df_test_ridge['Log1p_SalePrice'])
df_test_ridge[['Id', 'SalePrice']].to_csv('Ridge_SalePrice.csv', index=False)
#files.download('Ridge_SalePrice.csv') #0.12431
Figure 9.4: Ridge Model in Kaggle
Figure 8.3: Lasso Model
score = cv_rmse(lasso,X_train, y_train)
print("lasso: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['lasso'] = (score.mean(), score.std())
lasso =lasso.fit(X_train, y_train)
lasso_test = lasso.predict(X_test)
x_test_lasso = pd.DataFrame(lasso_test, columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_lasso['Log1p_SalePrice'])))
df_test_lasso = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
df_test_lasso['Log1p_SalePrice'] = lasso.predict(df_test_lasso)
df_test_lasso['SalePrice'] = np.expm1(df_test_lasso['Log1p_SalePrice'])
df_test_lasso[['Id', 'SalePrice']].to_csv('Lasso_SalePrice.csv', index=False)
#files.download('Lasso_SalePrice.csv') #0.12749
Figure 9.5: Lasso Model in Kaggle
Elastic Net Regression
alphas = arange (0.0001, 0.002, 0.0001)
len(alphas)
# Definning models
model_ridge = Ridge()
model_lasso = Lasso()
model_elasticNet = ElasticNet(l1_ratio=0.99)
# Tunning hyperparameter
# In this example we use alpha as our hyperparameter lambda.
ridge_alphas = list(arange (0.0001, 0.002, 0.0001)) + [0.1, 0.3, 1, 3, 5, 10, 15]
lasso_alphas = arange (0.0001, 0.002, 0.0001)
# Evaluating models through k-fold cross-validation defined earlier
cv_ridge = [cv_rmse(Ridge(alpha = alpha),X_train, y_train).mean()
for alpha in ridge_alphas]
cv_lasso = [cv_rmse(Lasso(alpha = alpha),X_train, y_train).mean()
for alpha in lasso_alphas]
cv_elasticNet = [cv_rmse(ElasticNet(alpha = alpha),X_train, y_train).mean()
for alpha in lasso_alphas]
cv_ridge = pd.Series(cv_ridge, index = ridge_alphas)
cv_lasso = pd.Series(cv_lasso, index = lasso_alphas)
cv_elasticNet = pd.Series(cv_elasticNet, index = lasso_alphas)
# In this example we use ratios as our hyperparameter l1_ratio.
ratios = arange(0, 1, 0.01)
cv_elasticNet2 = [cv_rmse(ElasticNet(l1_ratio = ratio),X_train, y_train).mean()
for ratio in ratios]
plt.figure(figsize=(20,10))
plt.subplot(251)
cv_ridge.plot(title = "Ridge Validation Curve for alpha")
plt.xlabel("alpha")
plt.ylabel("rmse")
cv_lasso = pd.Series(cv_lasso, index = lasso_alphas)
plt.subplot(252)
cv_lasso.plot(title = "Lasso Validation Curve")
plt.xlabel("alpha")
plt.ylabel("rmse")
cv_elasticNet = pd.Series(cv_elasticNet, index = lasso_alphas)
plt.subplot(253)
cv_elasticNet.plot(title = "ElasticNet Validation Curve for alpha")
plt.xlabel("alpha")
plt.ylabel("rmse")
plt.subplot(254)
cv_ridge.plot().set_xlim([0, 0.002])
cv_lasso.plot()
cv_elasticNet.plot()
plt.legend(labels=['Ridge','Lasso','ElasticNet'])
plt.title('Models Validation Curves for alpha')
plt.xlabel("alpha")
plt.ylabel("rmse")
cv_elasticNet2 = pd.Series(cv_elasticNet2, index = ratios)
plt.subplot(255)
cv_elasticNet2.plot(title = "ElasticNet Validation Curve for ratio")
plt.xlabel("ratio")
plt.ylabel("rmse")
plt.tight_layout()
Figure 8.1: hyperparameter tuning for Ridge, Lasso, and Elastic Net
ridgeCV = RidgeCV(alphas=alphas, cv=kf)
ridgeCV.fit(X_train, y_train)
# summarize chosen configuration
print('alpha: %f' % ridgeCV.alpha_)
lassoCV = LassoCV(alphas=alphas, cv=kf)
lassoCV.fit(X_train, y_train)
# summarize chosen configuration
print('alpha: %f' % lassoCV.alpha_)
Figure 8.4: Elastic Net Regression
elasticNetCV = ElasticNetCV(l1_ratio=ratios, alphas=lasso_alphas, cv=kf)
elasticNetCV.fit(X_train, y_train)
# summarize chosen configuration
print('alpha: %f' % elasticNetCV.alpha_)
print('l1_ratio_: %f' % elasticNetCV.l1_ratio_)
enet =ElasticNet(l1_ratio=.99, alpha=0.00070,random_state=321)
score = cv_rmse(enet,X_train, y_train)
print("ElasticNet: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['enet'] = (score.mean(), score.std())
# Elastic Net Regression
#enet = make_pipeline(RobustScaler(), ElasticNetCV(l1_ratio=.99,alphas =alphas, cv=kf))
#score = cv_rmse(enet,X_train, y_train)
#print("ElasticNet: {:.4f} ({:.4f})".format(score.mean(), score.std()))
#scores['enet'] = (score.mean(), score.std())
enet =enet.fit(X_train, y_train)
enet_test = enet.predict(X_test)
x_test_enet = pd.DataFrame(enet_test, columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_enet['Log1p_SalePrice'])))
# Plot the predictions for each model
sns.set_style("white")
fig = plt.figure(figsize=(24, 12))
ax = sns.pointplot(x=list(scores.keys()), y=[score for score, _ in scores.values()], markers=['o'], linestyles=['-'])
for i, score in enumerate(scores.values()):
ax.text(i, score[0] + 0.002, '{:.6f}'.format(score[0]), horizontalalignment='left', size='large', color='black', weight='semibold')
plt.ylabel('Score (RMSE)', size=20, labelpad=12.5)
plt.xlabel('Model', size=20, labelpad=12.5)
plt.tick_params(axis='x', labelsize=13.5)
plt.tick_params(axis='y', labelsize=12.5)
plt.title('Scores of Models', size=20)
plt.show()
df_test_enet = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
df_test_enet['Log1p_SalePrice'] = enet.predict(df_test_enet)
df_test_enet['SalePrice'] = np.expm1(df_test_enet['Log1p_SalePrice'])
df_test_enet[['Id', 'SalePrice']].to_csv('ENET_SalePrice.csv', index=False)
#files.download('ENET_SalePrice.csv') #0.12749
Figure 9.6: Elastic Net Model in Kaggle
# Fix outleir predictions
submission = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
submission['SalePrice'] = 1/4*(df_test_svr['SalePrice'] + df_test_ridge['SalePrice']+df_test_lasso['SalePrice']+df_test_enet['SalePrice'])
q1 = submission['SalePrice'].quantile(0.0045)
q2 = submission['SalePrice'].quantile(0.99)
print(q1, q2)
print(submission.head())
print(submission[['Id', 'SalePrice']].describe())
submission[['Id', 'SalePrice']].to_csv('Blended_SalePrice.csv', index=False)
#files.download('Blended_SalePrice.csv')
print(df_test_ridge['SalePrice'].describe())
# Fix outleir predictions
submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x > q1 else x*0.71)
submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x < q2 else x*1.18)
print(submission[['Id', 'SalePrice']].describe())
#Scale predictions
submission['SalePrice'] = submission['SalePrice']/1.005472
print(submission[['Id', 'SalePrice']].describe())
submission[['Id', 'SalePrice']].to_csv('Blended2_SalePrice.csv', index=False)
#files.download('Blended2_SalePrice.csv')
print(df_test_junk['SalePrice'].describe())
print(df_test_linear['SalePrice'].describe())
print(df_test_linearPCA['SalePrice'].describe())
print(df_test_svr['SalePrice'].describe())
print(df_test_ridge['SalePrice'].describe())
print(df_test_lasso['SalePrice'].describe())
print(df_test_enet['SalePrice'].describe())